modelr::Simple Simulated Datasets
Datasets
sim1 %>%
ggplot() +
geom_point(aes(x, y))
ggplot(sim2) +
geom_point(aes(x, y, color = x))
ggplot(sim3) +
geom_point(aes(x1, y, color = x2))
ggplot(sim4) +
geom_point(aes(x1, y, color = x2))
Slop-Intercept Line Model: Model Family and Coefficients
With random models
(random_models <- tibble(
a = runif(250, -5, 5),
b = runif(250, -20, 40)
))
cat(
"Generated Points Pairs:",
"\nlength(random_models$a):",
length(random_models$a),
"\nlength(random_models$b):",
length(random_models$b)
)
## Generated Points Pairs:
## length(random_models$a): 250
## length(random_models$b): 250
random_models %>%
mutate(rownum = row_number()) %>%
ggplot() + geom_point(aes(rownum, a), color = "brown") +
geom_point(aes(rownum, b), color = "violetred1")
# Plot the models as slop-intercept form lines
ggplot(sim1) +
geom_abline(data = random_models, mapping = aes(intercept = b, slope = a), alpha = 1/4) +
geom_point(aes(x, y))
abline_model<- function(mod, data) {
data$x * mod[1] + mod[2]
}
distance_deviation <- function(mod, data) {
diff <- data$y - abline_model(mod, data)
sqrt(mean(diff ^ 2))
}
(
measured_dist <- random_models %>%
mutate(distance_deviation = map2_dbl(a, b, function(a, b) {
distance_deviation(c(a, b), sim1)
})) %>%
arrange(distance_deviation)
)
ggplot(sim1) +
geom_point(size = 2, color = "red", mapping = aes(x, y)) +
# Overlay the best model onto the data
geom_abline(
data = filter(measured_dist, rank(distance_deviation) <= 1),
aes(intercept = b, slope = a, colour = -distance_deviation)
)
Visualize distance deviation across all models:
ggplot(measured_dist, aes(a, b)) +
geom_point(data = filter(measured_dist, rank(distance_deviation) <= 10), color = "red", size = 4) +
geom_point(aes(color = -distance_deviation)) +
xlab("a (slope)") + ylab("b (intercept)")
Grid Search
(grid_models <- expand.grid(
a = seq(1, 3, length = 25),
b = seq(-5, 20, length = 25)
)) %>%
ggplot(aes(a, b)) + geom_point() +
xlab("a (slope)") + ylab("b (intercept)")
# Search the best 10 models in the generated evenly spaced grid of points
(measured_dist <- grid_models %>%
mutate(distance_deviation = map2_dbl(a, b, function(a, b) {
distance_deviation(c(a, b), sim1)
}))) %>%
ggplot() +
geom_point(
data = filter(measured_dist, rank(distance_deviation) < 10 ),
mapping = aes(a, b), color = "red", size = 4
) +
geom_point(data = measured_dist, aes(a, b, color = -distance_deviation)) +
xlab("a (slope)") + ylab("b (intercept)")
Then visualize the best 9 models by overlaying them onto the original data:
ggplot(sim1, aes(x, y)) +
geom_abline(
data = filter(measured_dist, rank(distance_deviation) < 9),
mapping = aes(slope = a, intercept = b, color = factor(distance_deviation))
) +
geom_point(color = "grey30")
Newton-Raphson Search
(optimized_model <- optim(
c(0, 0),
fn = distance_deviation,
data = sim1
))
## $par
## [1] 2.051204 4.222248
##
## $value
## [1] 2.128181
##
## $counts
## function gradient
## 77 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
# Visualize this model onto original data
ggplot() +
geom_abline(
slope = optimized_model$par[1],
intercept = optimized_model$par[2],
color = "red"
) +
geom_point(data = sim1, mapping = aes(x, y), color = "grey30")
ggplot(sim1, aes(x)) +
geom_line(
data = data_grid(sim1, x) %>%
mutate(
pred = optimized_model$par[1] * x + optimized_model$par[2]
),
mapping = aes(y = pred),
color = "red", size = 1
) +
geom_point(aes(y = y), color = "grey30")
sim1 %>%
mutate(
pred = optimized_model$par[1] * x + optimized_model$par[2],
resid = y - pred
)
General Linear Model
\[ y = a_1 + a_2\cdot x_1 + a_3 \cdot x_2 + ... a_n \cdot x_{n-1} \] , which could be applied here as the case of \(n = 2 \Rightarrow y \sim x\).
In the linear formula above, \(y\) denotes the dependent variable and \(x\)s are the independent variables.
linear_model = lm(y ~ x, data = sim1)
# Extract Model Coefficients
coefficients(linear_model) # short as: coef(linear_model)
## (Intercept) x
## 4.220822 2.051533
cat(
"Extract Intercept:",
coef(linear_model)[[1]],
"\nExtract Slope: ",
coef(linear_model)[[2]]
)
## Extract Intercept: 4.220822
## Extract Slope: 2.051533
, where y ~ x would be translated as a function like y = ax + b.
sim1 %>%
data_grid(x) %>%
add_predictions(model = linear_model) %>%
ggplot(aes(x, pred)) +
geom_line(color = "red", size = 1) +
geom_point(data = sim1, mapping = aes(x, y))
Because the generated data grid of points are frome the original data instead of a manually defined array, this is an evenly spaced grid against regular spaced grid.
Residuals are the distances between the observed and predicted values computed above.(Wickham 2010)
sim1 %>%
add_residuals(linear_model) %>%
ggplot() + geom_freqpoly(aes(x = resid), bins = 30)
, where the occasions with the residual around 0 is the largest. The largest count around 0 shows that the average of the residual is 0.
sim1 %>%
add_residuals(linear_model) %>%
ggplot() +
geom_ref_line(h = 0) +
geom_point(aes(x, resid))
Categorical Predictor
The Classical Effects Model
(“The Lm() Function with Categorical Predictors,” n.d.)
Mock a set of categorical data for understanding the generation model of categorical data:
set.seed(10)
n <- 500
sigma <- 2
dummy_data <- tibble(
category_i = c(rep("category_1", n), rep("category_2", n), rep("category_3", n)),
j = c(1:n, 1:n, 1:n),
# Set the means of 8, 9.5 and 11 to each category
y = c(4 + sigma * rnorm(n), 7.5 + sigma * rnorm(n), 11 + sigma * rnorm(n))
)
dummy_data %>%
ggplot() + geom_ref_line(v = c(4, 7.5, 11)) +
geom_histogram(
# identity position is underlied here, instead of default stack positioning
aes(x = y, fill = category_i), position = "identity", binwidth = 0.5, alpha = 1/3
)
dummy_data %>%
ggplot() + geom_point(
aes(j, y, color = category_i)
)
# View this dataset in Matrix:
dummy_data %>%
tidyr::spread(key = category_i, value = y)
Let \(i\) the number of the category_i, and \(j\) the number of an observation within that category, we could have
- \(i\) is available as \(\{1, 2, 3\}\)
- \(j\) is available between \([1, 500]\)
- Each entry’s value
ycould be written as \(y_{ji}\)
\
\[\begin{cases} \text{category_1:} & y_{j1} = \mu + \tau_1 + \epsilon_{j1} \\ \text{category_2:} & y_{j2} = \mu + \tau_2 + \epsilon_{j2} \\ \text{category_3:} & y_{j3} = \mu + \tau_3 + \epsilon_{j3} \end{cases}\]$$ However the parameters above, \(\mu\) and \(\tau_1\) \(\tau_2\) \(\tau_3\), are not estimable, for which the Classical Effects Model is saied Overparameterized.
But the difference in means between categories could become estimable, where R’s lm() function uses a reparameterization called reference cell model, where one of the \(\tau_i\)s, actually the first item \(\tau_1\), is set to zero to allow for a solution: \[
\begin{align}
\tau_1 &= 0 \Rightarrow \overline{\text{Category_1}} = \mu + 0
& \text{, written as } \mu^* \\
\overline{\text{Category_2}} - \overline{\text{Category_1}}
&= (\mu + \tau_2) - (\mu + \tau_1) \\
&= \tau_2 - \tau_1 = \tau_2 - 0 & \text{, written as } \tau_2^*\\
\overline{\text{Category_3}} - \overline{\text{Category_1}}
&= (\mu + \tau_3) - (\mu + \tau_1) \\
&= \tau_3 - \tau_1 = \tau_3 - 0 & \text{, written as } \tau_3^*\\
\end{align}
\] With the symbols from the difference listed above, the means of each category could be rewritten again: \[
\begin{align}
\overline{\text{Category_1}} &= \mu + \tau_1 = \mu^* \\
\overline{\text{Category_2}} &= \mu + \tau_2 = \mu^* + \tau_2^* \\
\overline{\text{Category_3}} &= \mu + \tau_3 = \mu^* + \tau_3^* \\
\end{align}
\] , where the \(p\)-values for these tests are more likely to be meaningful as well.
All of the symbols assigned above could be shown and explain R’s lm() modeling function:
summary(
lm(y ~ category_i, data = dummy_data)
)
##
## Call:
## lm(formula = y ~ category_i, data = dummy_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0916 -1.3604 -0.0147 1.4461 7.5888
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.97825 0.09139 43.53 <2e-16 ***
## category_icategory_2 3.58899 0.12924 27.77 <2e-16 ***
## category_icategory_3 7.05809 0.12924 54.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.044 on 1497 degrees of freedom
## Multiple R-squared: 0.6658, Adjusted R-squared: 0.6654
## F-statistic: 1491 on 2 and 1497 DF, p-value: < 2.2e-16
where,
- \(\text{Residual standard error} = 2.044\) is exactly close to our preset \(\text{sigma} = 2\).
- the
(Intercept)is indeed the mean ofcategory_1 category_2’s coefficient is \(3.98 + 3.59 = 7.57\), which is extremely close to the preset value of \(7.5\)category_3’s coefficient is \(3.98 + 7.05 = 11.03\), which is exactly close the preset value of 11 indummy_data
Explain Sim2
summary(categorical_lm <- lm(y ~ x, data = sim2))
##
## Call:
## lm(formula = y ~ x, data = sim2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40131 -0.43996 -0.05776 0.49066 2.63938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1522 0.3475 3.316 0.00209 **
## xb 6.9639 0.4914 14.171 2.68e-16 ***
## xc 4.9750 0.4914 10.124 4.47e-12 ***
## xd 0.7588 0.4914 1.544 0.13131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.099 on 36 degrees of freedom
## Multiple R-squared: 0.8852, Adjusted R-squared: 0.8756
## F-statistic: 92.52 on 3 and 36 DF, p-value: < 2.2e-16
# Calculate the Root Mean Squared Error(RMSE)
cat("RMSE: ",
RMSE <-
sqrt(sum(residuals(categorical_lm) ^ 2) / df.residual(categorical_lm))
)
## RMSE: 1.098861
(categorical_pred_summary <- sim2 %>%
data_grid(x) %>%
add_predictions(categorical_lm))
sim2 %>%
data_grid(x) %>%
add_predictions(categorical_lm) %>%
ggplot() +
geom_point(data = sim2, mapping = aes(x, y, color = x)) +
# Visualize the coefficients of lm modeling
geom_ref_line(
h = coef(categorical_lm)[["(Intercept)"]],
colour = "olivedrab2"
) +
geom_ref_line(
h = coef(categorical_lm)[["(Intercept)"]] + coef(categorical_lm)[["xb"]],
colour = "olivedrab2"
) +
geom_ref_line(
h = coef(categorical_lm)[["(Intercept)"]] + coef(categorical_lm)[["xc"]],
colour = "olivedrab2"
) +
geom_ref_line(
h = coef(categorical_lm)[["(Intercept)"]] + coef(categorical_lm)[["xd"]],
colour = "olivedrab2"
) +
geom_crossbar(aes(
# Visualize the RMSE
x, pred, ymin = pred - RMSE, ymax = pred + RMSE
), data = categorical_pred_summary, width = 0.2)
, where
- \(x\) describes 4 categories and can take values
('a', 'b', 'c', 'd') - To interpret the coefficients above, Let the categorical variable \(x\) be into a dummy variable which takes values
(0, 1, 2, 3)for('a', 'b', 'c', 'd')correspondingly,- average \(y\) is higher by 6.9639 units for
bthan fora, all other variables held constant. - average \(y\) is higher by 4.9750 units for
cthan fora, all other variables held constant. - average \(y\) is higher by 0.7588 units for
dthan fora, all other variables held constant.
- average \(y\) is higher by 6.9639 units for
About the coefficients interpreting: https://towardsdatascience.com/interpreting-the-coefficients-of-linear-regression-cc31d4c6f235
Explain Sim3
(gathered_model <- sim3 %>%
data_grid(x1, x2) %>%
gather_predictions(
lm(y ~ x1 + x2, sim3),
lm(y ~ x1 * x2, sim3),
lm(y ~ x1 + x2 + x1 * x2, sim3)
))
ggplot(sim3) +
geom_point(aes(x1, y, color = x2)) +
geom_line(
data = gathered_model,
aes(x1, pred, color = x2
# group = paste(x2, model)
)
) +
facet_wrap(~ model)
sim3 %>%
gather_residuals(
lm(y ~ x1 + x2, sim3),
lm(y ~ x1 * x2, sim3)
) %>%
ggplot() +
geom_point(aes(x1, resid, color = x2)) +
facet_grid(model ~ x2)
The facets by both model above shows that: * little obvious pattern in the residuals for model,
y ~ x1 * x2 * Some Pattern has been clearly missed for model, y ~ x1 + x2, in the group x2 == b
Two Continuous Predictor
(predicts <- sim4 %>%
data_grid(
# Generates a 5 x 5 = 25 entries encountered table
x1 = seq_range(x1, 5),
x2 = seq_range(x2, 5)
) %>%
gather_predictions(
lm(y ~ x1 + x2, data = sim4),
lm(y ~ x1 * x2, data = sim4)
# lm(y ~ x1 * x2, data = sim4, pretty = TRUE)
))
# Visualize the two predictions as 3d surfaces
ggplot(predicts) + geom_tile(aes(x = x1, y = x2, fill = pred)) +
facet_wrap(~ model)
# Visualize the the surfaces in multile slices
ggplot(predicts) +
geom_line(aes(x = x1, y = pred, color = x2, group = x2)) +
facet_wrap(~ model)
# Visualize the the surfaces in multile slices
ggplot(predicts) +
geom_line(aes(x = x2, y = pred, color = x1, group = x1)) +
facet_wrap(~ model)
Because it is a manually specified array here without any dependency on data itself, this is a regularly spaced grid of values created in the code above.
The lines showing slices of the tiles are much more easy for comparing shades of the colors in tile graph that not suggest much difference between the two models.
References
“The Lm() Function with Categorical Predictors.” n.d. https://www.r-bloggers.com/the-lm-function-with-categorical-predictors/.
Wickham, Hadley. 2010. “A Layered Grammar of Graphics.” Journal of Computational and Graphical Statistics. https://www.researchgate.net/publication/228842388_A_Layered_Grammar_of_Graphics.